library(sp)
library(raster)
library(rgeos)
library(sf)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(leaflet)
library(rnaturalearth)
# load 10 km grid
pg = readRDS("~/Documents/GitHub/purple-air-infiltration/data/grid.RDS")
pg = gBuffer(pg, byid = T, width = 5000, capStyle = "SQUARE")
pg = st_as_sf(pg) %>%
st_transform(4326) %>%
select(ID)
# load land use class
nlcd = read.csv("~/BurkeLab Dropbox/Projects/smoke_PM_prediction/data/2_from_EE/NLCD_areas_10km_grid.csv") %>%
select(grid_id = ID, groups) %>%
mutate(groups = gsub("\\[|\\]", "", groups), # get rid of outer brackets
groups = strsplit(groups, "\\}, \\{")) %>% # split on commas between brackets
unnest(groups, keep_empty = TRUE) %>% # groups is now a list that we want to unnest (i.e. lengthen)
mutate(groups = gsub("\\{|\\}", "", groups)) %>% # drop the extra brackets left behind
separate(groups, into = c("landcover", "area"), sep = ",") %>% # split in commas to get land cover class and area
mutate(landcover = trimws(gsub("landcover=", "", landcover, fixed = TRUE)), # drop "landcover"
area = trimws(gsub("sum=", "", area, fixed = TRUE)) %>% as.numeric, # drop "sum"
landcover = recode(landcover, # recode the landcover variables to their classes
"1.0" = "water",
"2.0" = "developed",
"3.0" = "barren",
"4.0" = "forest",
"5.0" = "shrubland",
"7.0" = "herbaceous",
"8.0" = "cultivated",
"9.0" = "wetlands")) %>%
pivot_wider(names_from = landcover, values_from = area, # make it wider, one row for each grid cell, filling missings with 0s because that land class wasn't in the grid cell
values_fill = 0) %>%
mutate(total = water + developed + barren + forest + shrubland + herbaceous + cultivated + wetlands) %>% # calculate total area for the grid cell
mutate(across(!total & !grid_id, ~.x/total)) # calculate percentages in each landcover class
# get land use class as sf object
pg_nlcd = full_join(pg, nlcd %>% select(ID = grid_id, nlcd = barren), by = "ID")
# plot and color 10 km grid cells missing land use class
ggplot(pg_nlcd) + geom_sf(aes(fill = is.na(nlcd)), color = NA)

729 (0.73%) of our 10 km grid cells are missing NLCD. They are located throughout the border and coast.
# load elevation
elevation = read.csv("~/BurkeLab Dropbox/Projects/smoke_PM_prediction/data/2_from_EE/elevation_avg_sd_10km_grid.csv")
# get elevation as sf object
pg_elevation = full_join(pg, elevation %>% select(ID, elevation = mean), by = "ID")
# plot and color 10 km grid cells missing elevation
ggplot(pg_elevation) + geom_sf(aes(fill = is.na(elevation)), color = NA)

128 (0.13%) of our 10 km grid cells are missing elevation. They are located at the Canadian border, the Gulf of Mexico, and the Atlantic coast.
# load era5-land
era5l = readRDS("~/BurkeLab Dropbox/Data/ERA5/Land/2m_temperature/USA/10km_grid/UTC-0600/daily_mean_of_1-hourly/grid_2m_temperature_daily_mean_2006_01.rds") %>%
filter(date == min(date))
# get era5-land as sf object
pg_era5l = full_join(pg, era5l %>% select(ID = id_grid, era5l = `2m_temperature`), by = "ID")
# plot and color by 10 km grid cells missing era5-land
ggplot(pg_era5l) + geom_sf(aes(fill = is.na(era5l)), color = NA)

31 (0.03%) of our 10 km grid cells are missing ERA5-Land (after matching to NN non-NA within 1 degree). They are located in the Florida Keys.
# get 10 km grid cells missing any of these inputs
pg_na = reduce(list(st_drop_geometry(pg_nlcd),
st_drop_geometry(pg_elevation),
st_drop_geometry(pg_era5l)),
full_join,
by = "ID")
pg_na = full_join(pg, pg_na)
pg_na = pg_na %>% filter(is.na(nlcd) | is.na(elevation) | is.na(era5l))
nrow(pg_na)
[1] 788
In total, 788 of our 10 km grid cells are missing NLCD, elevation, or ERA5-Land.
# load EPA stations
epa = read_sf("~/BurkeLab Dropbox/Data/PM25/epa_station_locations/")
# check if any of those NA grid cells have epa stations in them
pg_epa = epa %>% st_filter(pg_na)
nrow(pg_epa) == 0
[1] TRUE
There are no EPA stations located in the 10 km grid cells missing NLCD, elevation, or ERA5-Land.
ggplot() +
geom_sf(data = pg_na, color = "blue") +
geom_sf(data = epa, color = "green", size = 0.3) +
coord_sf(xlim = c(-125, -63), ylim = c(23, 50))

- Blue = 10 km grid cells missing NLCD, elevation, or ERA5-Land
- Green = EPA stations
# load US landmass
us_land = ne_countries(scale = 10,
country = "United States of America",
returnclass = "sf")
# check if any of those NA grid cells overlap US landmass
pg_na_us_land = pg_na %>% st_filter(us_land)
nrow(pg_na_us_land)
[1] 56
56 (7.11%) of the 10 km grid cells missing NLCD, elevation, or ERA5-Land overlap with US landmass.
ggplot() +
geom_sf(data = us_land, fill = "green") +
geom_sf(data = pg_na, color = "blue", fill = NA) +
coord_sf(xlim = c(-125, -63), ylim = c(23, 50))

- Blue = 10 km grid cells missing NLCD, elevation, or ERA5-Land
- Green = US landmass
ggplot() +
geom_sf(data = us_land, fill = "green") +
geom_sf(data = pg_na_us_land, color = "blue", fill = NA) +
coord_sf(xlim = c(-125, -63), ylim = c(23, 50))

Showing only the grid cells that overlap US landmass.
# load population grid
population = readRDS("~/BurkeLab Dropbox/Data/10km_grid_data/grid_population.RDS")
# get population as sf object
pg_population = full_join(pg, population %>% select(ID = id, population = pop), by = "ID")
# check if any of those NA grid cells have population in them
# caveat: population data is from Gridded Population of the World, so may
# include ex-US population too
pg_na_population = pg_population %>% right_join(st_drop_geometry(pg_na), by = "ID")
pg_na_population %>%
st_drop_geometry() %>%
count(population > 0)
282 (35.79%) of the 10 km grid cells missing NLCD, elevation, or ERA5-Land have population in them. (Caveat: this is based on global population data, so if you only want to count US population, this may not be exactly helpful.)
ggplot() +
geom_sf(data = pg_na_population %>% filter(population > 0),
mapping = aes(fill = population),
color = NA)

They are located at various spots along the border.
# interactive look at the most populated places
populated_places = pg_na_population %>%
filter(population > quantile(population, 0.99)) %>%
arrange(desc(population)) %>%
st_centroid() %>%
st_coordinates() %>%
as.data.frame()
leaflet() %>%
addTiles() %>%
addMarkers(lng = populated_places$X, lat = populated_places$Y)
These are the 10 km grid cells missing NLCD, elevation, or ERA5-Land that have the most population.
---
title: "Check Grid Cells with Missing Inputs (Land Use Class, Elevation, and ERA5-Land)"
output: html_notebook
---

```{r setup, include = F}
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
```

```{r load_packages}
library(sp)
library(raster)
library(rgeos)
library(sf)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(leaflet)
library(rnaturalearth)
```

```{r}
# load 10 km grid
pg = readRDS("~/Documents/GitHub/purple-air-infiltration/data/grid.RDS")
pg = gBuffer(pg, byid = T, width = 5000, capStyle = "SQUARE")
pg = st_as_sf(pg) %>% 
  st_transform(4326) %>% 
  select(ID)
```

```{r}
# load land use class
nlcd = read.csv("~/BurkeLab Dropbox/Projects/smoke_PM_prediction/data/2_from_EE/NLCD_areas_10km_grid.csv") %>% 
  select(grid_id = ID, groups) %>% 
  mutate(groups = gsub("\\[|\\]", "", groups), # get rid of outer brackets
         groups = strsplit(groups, "\\}, \\{")) %>% # split on commas between brackets
  unnest(groups, keep_empty = TRUE) %>% # groups is now a list that we want to unnest (i.e. lengthen)
  mutate(groups = gsub("\\{|\\}", "", groups)) %>%  # drop the extra brackets left behind
  separate(groups, into = c("landcover", "area"), sep = ",") %>% # split in commas to get land cover class and area
  mutate(landcover = trimws(gsub("landcover=", "", landcover, fixed = TRUE)), # drop "landcover"
         area = trimws(gsub("sum=", "", area, fixed = TRUE)) %>% as.numeric, # drop "sum"
         landcover = recode(landcover, # recode the landcover variables to their classes
                            "1.0" = "water",
                            "2.0" = "developed",
                            "3.0" = "barren",
                            "4.0" = "forest",
                            "5.0" = "shrubland",
                            "7.0" = "herbaceous",
                            "8.0" = "cultivated",
                            "9.0" = "wetlands")) %>%
  pivot_wider(names_from = landcover, values_from = area, # make it wider, one row for each grid cell, filling missings with 0s because that land class wasn't in the grid cell
              values_fill = 0) %>%
  mutate(total = water + developed + barren + forest + shrubland + herbaceous + cultivated + wetlands) %>% # calculate total area for the grid cell
  mutate(across(!total & !grid_id, ~.x/total))  # calculate percentages in each landcover class

# get land use class as sf object
pg_nlcd = full_join(pg, nlcd %>% select(ID = grid_id, nlcd = barren), by = "ID")
```

```{r}
# plot and color 10 km grid cells missing land use class
ggplot(pg_nlcd) + geom_sf(aes(fill = is.na(nlcd)), color = NA)
```

`r pg_nlcd %>% filter(is.na(nlcd)) %>% nrow()` (`r round((pg_nlcd %>% filter(is.na(nlcd)) %>% nrow())/(pg %>% nrow())*100, 2)`%) of our 10 km grid cells are missing NLCD. They are located throughout the border and coast.

```{r}
# load elevation
elevation = read.csv("~/BurkeLab Dropbox/Projects/smoke_PM_prediction/data/2_from_EE/elevation_avg_sd_10km_grid.csv")

# get elevation as sf object
pg_elevation = full_join(pg, elevation %>% select(ID, elevation = mean), by = "ID")
```

```{r}
# plot and color 10 km grid cells missing elevation
ggplot(pg_elevation) + geom_sf(aes(fill = is.na(elevation)), color = NA)
```

`r pg_elevation %>% filter(is.na(elevation)) %>% nrow()` (`r round((pg_elevation %>% filter(is.na(elevation)) %>% nrow())/(pg %>% nrow())*100, 2)`%) of our 10 km grid cells are missing elevation. They are located at the Canadian border, the Gulf of Mexico, and the Atlantic coast.

```{r}
# load era5-land
era5l = readRDS("~/BurkeLab Dropbox/Data/ERA5/Land/2m_temperature/USA/10km_grid/UTC-0600/daily_mean_of_1-hourly/grid_2m_temperature_daily_mean_2006_01.rds") %>% 
  filter(date == min(date))

# get era5-land as sf object
pg_era5l = full_join(pg, era5l %>% select(ID = id_grid, era5l = `2m_temperature`), by = "ID")
```

```{r}
# plot and color by 10 km grid cells missing era5-land
ggplot(pg_era5l) + geom_sf(aes(fill = is.na(era5l)), color = NA)
```

`r pg_era5l %>% filter(is.na(era5l)) %>% nrow()` (`r round((pg_era5l %>% filter(is.na(era5l)) %>% nrow())/(pg %>% nrow())*100, 2)`%) of our 10 km grid cells are missing ERA5-Land (after matching to NN non-NA within 1 degree). They are located in the Florida Keys.

```{r}
# get 10 km grid cells missing any of these inputs
pg_na = reduce(list(st_drop_geometry(pg_nlcd), 
                    st_drop_geometry(pg_elevation), 
                    st_drop_geometry(pg_era5l)),
               full_join, 
               by = "ID")
pg_na = full_join(pg, pg_na)
pg_na = pg_na %>% filter(is.na(nlcd) | is.na(elevation) | is.na(era5l))
nrow(pg_na)
```

In total, `r pg_na %>% nrow()` of our 10 km grid cells are missing NLCD, elevation, or ERA5-Land.

```{r}
# load EPA stations
epa = read_sf("~/BurkeLab Dropbox/Data/PM25/epa_station_locations/")

# check if any of those NA grid cells have epa stations in them
pg_epa = epa %>% st_filter(pg_na)
nrow(pg_epa) == 0
```

There are `r ifelse((pg_epa %>% nrow()) == 0, "no", pg_epa %>% nrow())` EPA stations located in the 10 km grid cells missing NLCD, elevation, or ERA5-Land.

```{r}
# plot NA grid cells and EPA stations
ggplot() + 
  geom_sf(data = pg_na, color = "blue") + 
  geom_sf(data = epa, color = "green", size = 0.3) + 
  coord_sf(xlim = c(-125, -63), ylim = c(23, 50))
```

*  Blue = 10 km grid cells missing NLCD, elevation, or ERA5-Land
*  Green = EPA stations

```{r}
# load US landmass
us_land = ne_countries(scale = 10,
                       country = "United States of America",
                       returnclass = "sf")

# check if any of those NA grid cells overlap US landmass
pg_na_us_land = pg_na %>% st_filter(us_land)
nrow(pg_na_us_land)
```

`r pg_na_us_land %>% nrow()` (`r round((pg_na_us_land %>% nrow())/(pg_na %>% nrow())*100, 2)`%) of the 10 km grid cells missing NLCD, elevation, or ERA5-Land overlap with US landmass.

```{r}
# plot NA grid cells and US landmass
ggplot() + 
  geom_sf(data = us_land, fill = "green") + 
  geom_sf(data = pg_na, color = "blue", fill = NA) + 
  coord_sf(xlim = c(-125, -63), ylim = c(23, 50))
```

*  Blue = 10 km grid cells missing NLCD, elevation, or ERA5-Land
*  Green = US landmass

```{r}
# plot only where there is overlap
ggplot() + 
  geom_sf(data = us_land, fill = "green") + 
  geom_sf(data = pg_na_us_land, color = "blue", fill = NA) + 
  coord_sf(xlim = c(-125, -63), ylim = c(23, 50))
```

Showing only the grid cells that overlap US landmass.

```{r}
# load population grid
population = readRDS("~/BurkeLab Dropbox/Data/10km_grid_data/grid_population.RDS")

# get population as sf object
pg_population = full_join(pg, population %>% select(ID = id, population = pop), by = "ID")

# check if any of those NA grid cells have population in them
# caveat: population data is from Gridded Population of the World, so may 
# include ex-US population too
pg_na_population = pg_population %>% right_join(st_drop_geometry(pg_na), by = "ID")
pg_na_population %>% 
  st_drop_geometry() %>% 
  count(population > 0)
```

`r pg_na_population %>% filter(population > 0) %>% nrow()` (`r round((pg_na_population %>% filter(population > 0) %>% nrow())/(pg_na_population %>% nrow())*100, 2)`%) of the 10 km grid cells missing NLCD, elevation, or ERA5-Land have population in them. (Caveat: this is based on global population data, so if you only want to count US population, this may not be exactly helpful.)

```{r}
# plot only where there is population
ggplot() + 
  geom_sf(data = pg_na_population %>% filter(population > 0), 
          mapping = aes(fill = population),
          color = NA)
```

They are located at various spots along the border.

```{r}
# interactive look at the most populated places
populated_places = pg_na_population %>% 
  filter(population > quantile(population, 0.99)) %>% 
  arrange(desc(population)) %>% 
  st_centroid() %>% 
  st_coordinates() %>% 
  as.data.frame()
leaflet() %>%
  addTiles() %>% 
  addMarkers(lng = populated_places$X, lat = populated_places$Y)
```

These are the 10 km grid cells missing NLCD, elevation, or ERA5-Land that have the most population.
